Input

Get list of articles and of MeSH terms

# Read json with MeSH terms by paper
# (read_json presumably comes from jsonlite — TODO confirm the library() calls
#  earlier in the file. simplifyVector collapses JSON arrays into R vectors, so
#  each list element should be a character vector of MeSH terms keyed by article
#  id; articles without annotations appear to carry NA — see the is.na checks below)
mesh_terms_by_article_list = read_json('./../Data/MeSH_terms_by_paper.json', simplifyVector = TRUE)

Preprocess MeSH terms

  • Remove subheadings/qualifiers

  • Split MeSH terms that contain commas into separate parts and keep only the last (most specific) one

# MeSH subheadings/qualifiers to strip from the raw terms.
# Kept in this exact order: the alternation regex below is built directly from it.
qualifiers = c('abnormalities', 'administration & dosage', 'adverse effects', 'analogs & derivatives',
               'analysis', 'anatomy & histology', 'antagonists & inhibitors', 'biosynthesis',
               'blood supply', 'cerebrospinal fluid', 'chemical synthesis', 'chemically induced',
               'classification', 'complications', 'congenital', 'cytology', 'deficiency',
               'diagnosis', 'diagnostic imaging', 'diet therapy', 'drug effects', 'drug therapy',
               'economics', 'education', 'embryology', 'enzymology', 'epidemiology', 'ethics',
               'ethnology', 'etiology', 'growth & development', 'history', 'immunology',
               'injuries', 'innervation', 'instrumentation', 'isolation & purification',
               'legislation & jurisprudence', 'manpower', 'metabolism', 'methods',
               'microbiology', 'mortality', 'nursing', 'organization & administration',
               'parasitology', 'pathology', 'pharmacokinetics', 'pharmacology', 'physiology',
               'physiopathology', 'poisoning', 'prevention & control', 'psychology',
               'radiation effects', 'radiotherapy', 'rehabilitation', 'secondary', 'secretion',
               'standards', 'statistics & numerical data', 'supply & distribution', 'surgery',
               'therapeutic use', 'therapy', 'toxicity', 'transmission', 'trends',
               'ultrastructure', 'urine', 'utilization', 'veterinary', 'virology')

# Four extra qualifiers that also occur as substrings of legitimate MeSH terms
# (e.g. 'chemistry' inside 'Immunohistochemistry'); the cleaning loop special-cases
# them instead of putting them in the blanket regex
qualifiers_all = c(qualifiers, 'agonists', 'blood', 'chemistry', 'genetics')

# Single alternation over all simple qualifiers (the entries contain no regex metacharacters)
qualifiers_regex = paste(qualifiers, collapse='|')

# Clean every article's MeSH terms in place:
#   1. strip qualifier/subheading text (special cases first, then the blanket regex)
#   2. split comma-separated terms and keep only the last (most specific) part
for(article in names(mesh_terms_by_article_list)){
 
  mesh_terms = mesh_terms_by_article_list[[article]]
  new_mesh_terms = c()
  
  # NA in the first slot marks articles with no MeSH annotations; leave them untouched
  if(!is.na(mesh_terms[1])){
    
    for(mesh_term in mesh_terms){

      # Look for simple qualifiers and delete them
      match_general = grepl(qualifiers_regex, mesh_term)
      # 'agonists' is special-cased so it is not stripped out of 'antagonists & inhibitors'
      match_agonists = grepl('agonists', mesh_term) & !grepl('ntagonists', mesh_term)
      # 'blood' alone is a qualifier; 'blood supply' is left for the general regex
      match_blood = grepl('blood', mesh_term) & !grepl('blood supply', mesh_term)
      # strip 'chemistry' but preserve terms like 'Immunohistochemistry'
      match_chemistry = grepl('chemistry', mesh_term) & !grepl('immunohistochemistry', mesh_term)
      # NOTE(review): 'Xgenetics' can never match, so this exclusion is a no-op —
      # was something like 'cytogenetics'/'epigenetics' intended? TODO confirm
      match_genetics = grepl('genetics', mesh_term) & !grepl('Xgenetics', mesh_term)
      
      # Specific removals run before the general one; gsub removes only the
      # qualifier text, stray separators are mostly absorbed by the split below
      if(match_agonists) mesh_term = gsub('agonists', '', mesh_term)
      if(match_blood) mesh_term = gsub('blood', '', mesh_term)
      if(match_chemistry) mesh_term = gsub('chemistry', '', mesh_term)
      if(match_genetics) mesh_term = gsub('genetics', '', mesh_term)
      if(match_general) mesh_term = gsub(qualifiers_regex, '', mesh_term)
      
      # Separate MeSH terms by commas
      mesh_term = mesh_term %>% strsplit(', ') %>% unlist
      mesh_term = mesh_term[length(mesh_term)] # keep last term
      
      # Update (grows a vector in a loop — acceptable at this data size)
      new_mesh_terms = c(new_mesh_terms, mesh_term)
    }
  
    # Update article entries, de-duplicating terms that collapsed to the same string
    mesh_terms_by_article_list[[article]] = unique(new_mesh_terms)
  }
  
}

# Drop all intermediates from the cleaning step.
# Fix: match_chemistry (created alongside the other match_* flags) was missing here.
rm(qualifiers, qualifiers_all, qualifiers_regex, article, mesh_term, mesh_terms, new_mesh_terms,
   match_general, match_agonists, match_blood, match_chemistry, match_genetics)
# Get list of all mesh terms
# Accumulate the distinct MeSH terms seen so far; n[k] = number of distinct
# terms after processing k-1 articles (n starts at 0, before any article)
all_mesh_terms = c()
n = 0

for(mesh_terms_in_article in mesh_terms_by_article_list){
  all_mesh_terms = unique(c(all_mesh_terms, mesh_terms_in_article))
  n = c(n, length(all_mesh_terms))
}

# Fix: n has length(articles)+1 entries (leading 0), so the x-axis must start at
# 0 articles — seq(1, length(n)) shifted every point by one article
mesh_terms_counts = data.frame('articles' = seq_along(n) - 1, 'mesh_terms' = n)

print(paste0('Articles: ', length(mesh_terms_by_article_list), '       Mesh terms: ', length(all_mesh_terms)))
## [1] "Articles: 3709       Mesh terms: 4061"
# Cumulative distinct-term curve, with a linear trend line for reference
growth_plot = mesh_terms_counts %>%
  ggplot(aes(articles, mesh_terms, color='#434343')) +
  geom_line() +
  geom_smooth(method='lm', color='#666666', se=FALSE, size=0.5) +
  theme_minimal() +
  theme(legend.position='none') +
  ggtitle('Increase in number of MeSH terms by each new article')
ggplotly(growth_plot)
rm(mesh_terms_in_article, n, mesh_terms_counts, growth_plot)

Create article-MeSH term matrix

# Binary incidence matrix: one row per article, one column per non-NA MeSH term
all_articles = names(mesh_terms_by_article_list)

article_mesh_term_mat = data.frame(matrix(0, nrow=length(all_articles), ncol=sum(!is.na(all_mesh_terms))))
rownames(article_mesh_term_mat) = all_articles
colnames(article_mesh_term_mat) = all_mesh_terms[!is.na(all_mesh_terms)]

# Mark each article's terms with a 1; articles whose entry contains NA
# (no MeSH annotations) are skipped and stay all-zero
for(paper in all_articles){
  paper_terms = mesh_terms_by_article_list[[paper]]
  if(!anyNA(paper_terms)){
    article_mesh_term_mat[paper, paper_terms] = 1
  }
}

rm(mesh_terms_by_article_list, paper, paper_terms)

An important proportion of articles (almost 10%) don’t have MeSH terms

# One row per article with its number of assigned MeSH terms
mesh_terms_by_article = data.frame('id' = rownames(article_mesh_term_mat), 'n_mesh_terms' = rowSums(article_mesh_term_mat))

# One histogram bin per possible count so every integer value gets its own bar
bins = diff(range(mesh_terms_by_article$n_mesh_terms)) + 1
terms_per_article_plot = mesh_terms_by_article %>%
  ggplot(aes(n_mesh_terms)) +
  geom_histogram(bins=bins, alpha=0.6) +
  ggtitle('Distribution of number of MeSH terms by article') +
  theme_minimal()
ggplotly(terms_per_article_plot)
rm(bins, terms_per_article_plot)

Most MeSH terms are present in only a few articles

# One row per MeSH term with the number of articles it appears in
articles_by_mesh_term = data.frame('id' = as.character(colnames(article_mesh_term_mat)),
                                   'n_articles' = colSums(article_mesh_term_mat), stringsAsFactors = FALSE)

# One bin per possible count; sqrt scales on both axes tame the heavy right skew
bins = diff(range(articles_by_mesh_term$n_articles)) + 1
articles_per_term_plot = articles_by_mesh_term %>%
  ggplot(aes(n_articles)) +
  geom_histogram(bins=bins, alpha=0.6) +
  scale_x_sqrt() + scale_y_sqrt() +
  theme_minimal() +
  ggtitle('Distribution of number of articles that contain each MeSH term')
ggplotly(articles_per_term_plot)
rm(bins, articles_per_term_plot)

Many of the most popular MeSH terms aren’t useful

top_n_counts = 50
# Terms present in the most articles (top_n keeps all ties at the cutoff)
top_mesh_terms = articles_by_mesh_term %>% top_n(top_n_counts, wt=n_articles)

ggplotly(top_mesh_terms %>% ggplot(aes(reorder(id, n_articles), n_articles, fill=n_articles)) + 
         geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() + 
         ggtitle(paste0('Top ', top_n_counts, ' MeSH Terms')) + 
         xlab('') + ylab('No. Articles') + theme_minimal() + theme(legend.position='none'))
# Fix: 'bins' was already removed above, so rm()-ing it again only raised a warning
rm(mesh_terms_by_article)

Filters:

  • Articles that have no MeSH terms
# Flag articles that retained at least one MeSH term after cleaning
keep_articles = rowSums(article_mesh_term_mat) > 0
print(paste0(sum(!keep_articles), '/', nrow(article_mesh_term_mat), ' articles have no MeSH terms'))
## [1] "363/3709 articles have no MeSH terms"


  • MeSH terms only present in a single article
# Flag MeSH terms that occur in at least two articles
keep_mesh_terms = colSums(article_mesh_term_mat) > 1
print(paste0(sum(!keep_mesh_terms), '/', ncol(article_mesh_term_mat), ' MeSH terms are mentioned in a single article'))
## [1] "1662/4060 MeSH terms are mentioned in a single article"
# Apply both filters at once, then drop the removed terms from the summary table
article_mesh_term_mat = article_mesh_term_mat[keep_articles, keep_mesh_terms]
articles_by_mesh_term = articles_by_mesh_term %>% filter(id %in% colnames(article_mesh_term_mat))

print(paste0('Remaining Articles: ', nrow(article_mesh_term_mat), '       Mesh terms: ', ncol(article_mesh_term_mat)))
## [1] "Remaining Articles: 3346       Mesh terms: 2398"
rm(keep_articles, keep_mesh_terms)


  • Terms that are not related to the type of experiment that was conducted
# Substrings (matched case-insensitively below) that mark a MeSH term as
# experiment-related.
# Fixes: 'behaviour' -> 'behavio' (matches both US 'behavior' — the spelling MeSH
# uses — and UK 'behaviour'); 'phospor' -> 'phosphor' (typo that matched nothing,
# e.g. 'Phosphorylation' was being dropped).
keep_terms = c('abnormalit', 'acid', 'aging', 'allele', 'amin', 'analgesic', 'analysis', 'ancestry', 'astrocyte',
               'axon', 'behavio', 'binding', 'biomark', 'blot', 'brain', 'case-control', 'cell', 'cereb', 
               'channel', 'chemistry', 'chromatin', 'chromosome', 'circadian', 'clon', 'cluster', 'codon', 
               'cognit', 'cohort', 'consanguinity', 'cortex', 'cpg', 'culture', 'dendrit', 'develop',
               'disease', 'disorder', 'dna', 'dopamin', 'drug', 'electro', 'enzym', 'epilepsy', 'etiology',
               'exome', 'exon', 'fibroblast', 'fluorescence', 'gabapentin', 'gen', 'haplo', 'heart', 'hippocamp', 
               'histon', 'hypothalamus', 'imaging', 'immun', 'in situ', 'in vitro', 'intron', 'kinase', 'knockout', 
               'link', 'megalencephaly', 'membran', 'mental', 'messenger', 'metabo', 'methyl', 'microcephaly',
               'microscop', 'microtubules', 'missense', 'model', 'molecul', 'muta', 'neuro', 'nonsense', 'nucleotide',
               'oxytocin', 'pair', 'pathway', 'peptid', 'pervasive', 'phenotype', 'phosphor', 'polymerase', 
               'polymorphism', 'promoter', 'protein', 'psychiatric', 'receptor', 'recessive', 'region', 'regulat',
               'risk', 'rna', 'seizure', 'sequence', 'serotonin', 'sibling', 'signal', 'spectrometry',
               'splicing', 'statistics', 'striatum', 'synap', 'technique', 'trait loci', 'transcript',
               'transfection', 'transloc', 'tumor', 'vasopressin', 'zygote')

# Keep only MeSH terms matching at least one experiment-related keyword.
# grepl() already returns logical, so the redundant ifelse(..., TRUE, FALSE) is gone.
articles_by_mesh_term = articles_by_mesh_term %>% 
                        mutate('keep' = grepl(paste(keep_terms, collapse='|'), tolower(id))) %>%
                        filter(keep)

# all_of() makes the external-character-vector selection explicit; passing the
# bare vector to select() is deprecated tidyselect behavior
article_mesh_term_mat = article_mesh_term_mat %>% dplyr::select(all_of(articles_by_mesh_term$id))
# Drop articles left without any surviving term
article_mesh_term_mat = article_mesh_term_mat[rowSums(article_mesh_term_mat)>0, ]

print(paste0('Articles: ', nrow(article_mesh_term_mat), '       Mesh terms: ', ncol(article_mesh_term_mat)))
## [1] "Articles: 3344       Mesh terms: 1147"
rm(keep_terms)
top_n_counts = 50
top_mesh_terms = articles_by_mesh_term %>% top_n(top_n_counts, wt=n_articles)
# Horizontal bar chart of the most frequent terms after the keyword filter
new_top_terms_plot = top_mesh_terms %>%
  ggplot(aes(reorder(id, n_articles), n_articles, fill=n_articles)) +
  geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() +
  ggtitle(paste0('New top ', top_n_counts, ' MeSH Terms')) +
  xlab('') + ylab('No. Articles') +
  theme_minimal() + theme(legend.position='none')
ggplotly(new_top_terms_plot)
rm(top_n_counts, top_mesh_terms, new_top_terms_plot)

Network:

MeSH Terms Network

# Sparse representation speeds up the term-by-term cross-product below
sparse_article_mesh_term_mat = Matrix(as.matrix(article_mesh_term_mat), sparse=TRUE)

# EDGES
# Co-occurrence counts: entry (i,j) = number of articles containing both terms
mesh_mesh_mat = t(sparse_article_mesh_term_mat) %*% sparse_article_mesh_term_mat

# summary() returns the non-zero triplets (i, j, x); map the integer indices back
# to term names and drop the diagonal (a term with itself). Each undirected pair
# appears in both orientations here — hence nrow(edges)/2 in the print below —
# and the duplicates are collapsed by simplify() at the end
edges = summary(mesh_mesh_mat) %>% rename('from'=i, 'to'=j, 'count'=x) %>%
        mutate(from = colnames(article_mesh_term_mat)[from],
               to = colnames(article_mesh_term_mat)[to]) %>%
        filter(from!=to)

# Convert count to fraction
# Dice-like weight: co-occurrences scaled by the two terms' summed article counts
article_mesh_term_colsums = colSums(article_mesh_term_mat)
names(article_mesh_term_colsums) = colnames(article_mesh_term_mat)
edges = edges %>% mutate('weight'=count/(article_mesh_term_colsums[from]+article_mesh_term_colsums[to]))

# NODES
# One node per remaining MeSH term; 'title' is the hover label, 'value' the node size
nodes = articles_by_mesh_term %>% mutate('id'=as.character(id), 'title'=as.character(id), 'value'=n_articles)

print(paste0('Nodes: ', nrow(nodes), '        Edges: ', nrow(edges)/2))
## [1] "Nodes: 1147        Edges: 50085"
graph = graph_from_data_frame(edges, vertices=nodes, directed=FALSE)
# Collapse the duplicated (i,j)/(j,i) edges, keeping the first copy's attributes
graph = graph %>% simplify(edge.attr.comb='first')

# Eigenvector centrality of every term in the full co-occurrence network
# (eigen_centrality_output / eigencentr are reused further down, so they persist)
eigen_centrality_output = eigen_centrality(graph)
eigencentr = as.numeric(eigen_centrality_output$vector)

# Plot the 50 most central terms
plot_data = data.frame('id' = graph %>% get.vertex.attribute('name'),
                       'eigencentrality' = eigencentr) %>%
            top_n(50, wt=eigencentrality)
centrality_plot = plot_data %>%
  ggplot(aes(reorder(id, eigencentrality), eigencentrality, fill=eigencentrality)) +
  geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() +
  ggtitle(paste0('MeSH terms with the highest Network centrality')) +
  xlab('') + ylab('Eigencentrality') +
  theme_minimal() + theme(legend.position='none')
ggplotly(centrality_plot)
rm(plot_data, centrality_plot)

Clustering

# Louvain community detection (igraph uses the 'weight' edge attribute by default)
comms_louvain = cluster_louvain(graph)
modules = comms_louvain %>% membership

# One viridis color per module; substr(1,7) keeps '#RRGGBB' and drops the alpha suffix
color_pal = viridis_pal()(max(modules)) %>% substr(1,7)
# Store centrality, module id and module color on the vertices
# ('color' is the attribute the network drawing picks up)
graph = graph %>% set_vertex_attr('eigencentrality', value=eigencentr) %>%
                  set_vertex_attr('module', value=as.numeric(modules)) %>% 
                  set_vertex_attr('color_modules', value=color_pal[as.numeric(modules)]) %>%
                  set_vertex_attr('color', value=color_pal[as.numeric(modules)])

# Bar chart of module sizes
comm_sizes = sizes(comms_louvain) %>% data.frame %>% rename(Module=Community.sizes, size=Freq)
ggplotly(comm_sizes %>% ggplot(aes(Module, size, fill=size)) + geom_bar(stat='identity') + 
         ggtitle('Number of MeSH Terms in each module') + theme_minimal() + theme(legend.position = 'none'))
# Interactive network with a module-selection dropdown; fixed seed for a stable layout
visIgraph(graph) %>% visOptions(selectedBy='module', highlightNearest=TRUE) %>% 
                     visIgraphLayout(randomSeed=123)
rm(color_pal, eigen_centrality_output, eigencentr, comm_sizes, edges, nodes)

Most important MeSH terms for modules with more than 50 nodes

# Summarize each Louvain module (size + most central MeSH term); for modules with
# more than 50 terms, also plot the top-10 terms by within-module eigencentrality
module_info = tibble('module'=character(), 'top_term'=character(), size=integer())

for(i in names(sizes(comms_louvain))){
  
  # Community ids are character here; modules==i compares via implicit coercion
  module_vertices = names(modules)[modules==i]
  
  # Create the module's induced subgraph
  module_graph = induced_subgraph(graph, module_vertices)
  
  # Calculate eigencentrality within the module only
  eigen_centrality_output = eigen_centrality(module_graph)
  eigencentr = as.numeric(eigen_centrality_output$vector)
  names(eigencentr) = module_vertices
  
  # Record the module's most central term ([1] breaks ties by first occurrence)
  module_info = module_info %>% add_row('module'=i, 'top_term'=names(eigencentr)[eigencentr==max(eigencentr)][1], 
                                        'size'=length(module_vertices))
    
  if(length(module_vertices)>50){
    
    # Plot eigencentrality of top 10 MeSH terms
    # (explicit print() is required to render ggplots from inside a loop)
    plot_data = data.frame('mesh_term'=names(eigencentr), 'eigencentrality'=eigencentr) %>%
                top_n(10, wt=eigencentrality)
    print(plot_data %>% ggplot(aes(reorder(mesh_term, eigencentrality), eigencentrality, fill=eigencentrality)) + 
         geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() + 
         ggtitle(paste0('Top terms for Module ', i)) + 
         xlab('MeSH Terms') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))
  }

}

rm(i, module_vertices, module_graph, eigen_centrality_output, eigencentr, plot_data, comms_louvain)

Module Network

  • Collapsing edges: sum all original weights and divide by the product of the number of elements in both (the number of possible interactions they could have)

  • The Network plot is not that informative

# Collapse each module into a single vertex. vertex.attr.comb='first' keeps the
# attributes of the first vertex of each module; simplify(edge.attr.comb='sum')
# sums the weights of all original between-module edges (and drops self-loops)
module_graph = contract(graph, graph %>% get.vertex.attribute('module'), vertex.attr.comb='first') %>%
               simplify(edge.attr.comb='sum') %>% set_vertex_attr('name', value=module_info$module) %>%
                                                  set_vertex_attr('title', value=module_info$top_term) %>%
                                                  set_vertex_attr('module_size', value=module_info$size) %>%
                                                  set_vertex_attr('value', value=module_info$size)

nodes_module_graph  = module_graph %>% as_data_frame(what='vertices') %>% mutate('id'=name)
# Normalize each summed weight by the number of possible term pairs between the
# two modules (size_i * size_j), x100 for readability.
# NOTE(review): as.numeric(from/to) indexes rows positionally by module id — this
# assumes module ids are 1..K and match nodes_module_graph row order; verify
edges_module_graph = module_graph %>% as_data_frame() %>% 
                     mutate('corrected_weight' = weight/(nodes_module_graph$module_size[as.numeric(from)]*
                                                 nodes_module_graph$module_size[as.numeric(to)])*100)

# Overwrite edge weights with the corrected values before plotting
module_graph = module_graph %>% set_edge_attr('weight', value=edges_module_graph$corrected_weight)

visIgraph(module_graph) %>% visIgraphLayout(randomSeed=123) %>% visOptions(highlightNearest=TRUE)
rm(module_info, nodes_module_graph, edges_module_graph)
# Module-level adjacency (corrected edge weights) as a dense matrix for the heatmap.
# Fix: removed the no-op self-assignment 'adj_mat = adj_mat' that followed this line.
adj_mat = module_graph %>% as_adjacency_matrix(attr='weight') %>% as.matrix

# Annotated heatmap of inter-module connection strength (no clustering, no key)
heatmap.2(adj_mat, cellnote=round(adj_mat,2), dendrogram='none', notecol='gray', scale='none', trace='none',
          key=FALSE, xlab='Modules', ylab='Modules', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))

rm(adj_mat)

Save file with the modules each paper belongs to through its mesh terms

# Map each MeSH term to its module, then project articles onto modules
mesh_term_module = graph %>% as_data_frame(what='vertices') %>% dplyr::select(name, module)

# One-hot matrix: rows = MeSH terms, columns = module ids (Louvain ids are 1..K,
# so numeric column indexing below lines up with the sorted character colnames)
mesh_term_module_mat = matrix(0, nrow(mesh_term_module), max(mesh_term_module$module))
rownames(mesh_term_module_mat) = mesh_term_module$name
colnames(mesh_term_module_mat) = sort(unique(mesh_term_module$module))

for(i in seq_len(nrow(mesh_term_module))){
  mesh_term_module_mat[i, mesh_term_module[i,2]] = 1
}

# A misalignment here would silently produce a wrong product below and a wrong
# CSV, so surface it as a warning instead of a plain print
if(!all(colnames(article_mesh_term_mat) == rownames(mesh_term_module_mat))){
  warning('Row and column order dont match!', call. = FALSE)
}

# article x module counts: how many of each article's terms fall in each module
article_module_mat = as.matrix(article_mesh_term_mat) %*% mesh_term_module_mat
rownames(article_module_mat) = rownames(article_mesh_term_mat)

write.csv(article_module_mat, file='./../Data/article_module_matrix_wo_qualifiers_specific.csv')

print('Distribution of number of modules per article')
## [1] "Distribution of number of modules per article"
# How many distinct modules each article touches
table(apply(article_module_mat, 1, function(x) sum(x>0)))
## 
##    1    2    3    4    5    6 
##  731 1245  842  374  122   30
# Long format: one row per (article, module) pair with the term count as 'value'
melt_article_module = melt(article_module_mat)
colnames(melt_article_module) = c('articleID', 'module', 'value')
melt_article_module$module = as.factor(melt_article_module$module)

module_content_plot = melt_article_module %>%
  ggplot(aes(value, fill=module, color=module)) +
  geom_histogram(position='identity', bins=max(melt_article_module$value), alpha=0.5) +
  facet_grid(module~., scales='free_y') +
  ggtitle('Module content distribution per article') +
  xlab('n_articles') + ylab('Module content') +
  theme_minimal() + theme(legend.position='none')
ggplotly(module_content_plot)
rm(mesh_term_module, mesh_term_module_mat, i, module_content_plot)